1. Import Python Packages

To install the kernel used by NERSC MetAtlas users, copy the following text to $HOME/.ipython/kernels/mass_spec_cori/kernel.json:

{
 "argv": [
  "/global/common/software/m2650/python-cori/bin/python",
  "-m",
  "IPython.kernel",
  "-f",
  "{connection_file}"
 ],
 "env": {
    "PATH": "/global/common/software/m2650/python-cori/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
 },
 "display_name": "mass_spec_cori",
 "language": "python"
}
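
If you prefer, the file can be written from Python instead of by hand; a minimal sketch that just serializes the JSON above:


In [ ]:
import json, os
kernel_dir = os.path.join(os.environ['HOME'], '.ipython', 'kernels', 'mass_spec_cori')
if not os.path.exists(kernel_dir):
    os.makedirs(kernel_dir)
kernel_spec = {"argv": ["/global/common/software/m2650/python-cori/bin/python",
                        "-m", "IPython.kernel", "-f", "{connection_file}"],
               "env": {"PATH": "/global/common/software/m2650/python-cori/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"},
               "display_name": "mass_spec_cori",
               "language": "python"}
with open(os.path.join(kernel_dir, 'kernel.json'), 'w') as fid:
    json.dump(kernel_spec, fid, indent=1)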

In [ ]:
%matplotlib notebook
import sys, os

#### add a path to your private code if not using production code ####
#sys.path.insert(0,"/global/homes/d/dgct/Repos/metatlas/") #where your private code is
######################################################################

from metatlas.helpers import dill2plots as dp
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import chromatograms_mp_plots as cp
from metatlas.helpers import chromplotplus as cpp
import metatlas.metatlas_objects as metob
from metatlas.helpers import mzmine_helpers as mzm

import qgrid

from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from IPython.display import display

import time
import dill
import numpy as np
import multiprocessing as mp
import pandas as pd

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)

In [ ]:
files = dp.get_metatlas_files(experiment = '20170725_SK_ECOFAB_RtExudDay_20170630',name = '%',most_recent = False)
df = metob.to_dataframe(files)
# df = df[df.username=='pasteur']
my_grid = qgrid.QGridWidget(df=df[['experiment','name','username','acquisition_time']])
my_grid.export()

2. Register LCMS Runs into categorical groups.

  • Select MetAtlas LCMS runs by experiment and filename.
  • Create a "File-Info" sheet from the selected files.
    This sheet needs to be downloaded and filled in. The "File-Info" sheet is the exchange format we use to define group membership for LCMS runs.

In [ ]:
dp.make_empty_fileinfo_sheet('/global/homes/b/bpb/Downloads/empty_fileinfo.tab',files)
  • Create metatlas groups from the filled-in file-info sheet.
    Defining groups of files lets you select sets of LCMS runs by group name. Group membership is also preserved in exported metatlas datasets, so applying statistical methods to grouped data is straightforward.

Your filled-in sheet will look something like this:

mzml_file                                                                                     | group                                 | description
.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_10_Run413.mzML | 20160531_KBL_C18_Vio_cells_384_Quad_1 |
.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_11_Run415.mzML | 20160531_KBL_C18_Vio_cells_384_Quad_1 |
.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_12_Run417.mzML | 20160531_KBL_C18_Vio_cells_384_Quad_1 |
.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_1_Run395.mzML  | 20160531_KBL_C18_Vio_cells_384_Quad_2 |
.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_2_Run397.mzML  | 20160531_KBL_C18_Vio_cells_384_Quad_2 |
.../20160531_KBL_violacein_cells_384_final/20160531_C18_ACN_POS_MSMS_KBL_Qex_A_3_Run399.mzML  | 20160531_KBL_C18_Vio_cells_384_Quad_2 |

A text description of each group is optional; a few short sentences describing the group is enough.
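
Before registering groups, you can sanity-check the filled-in sheet with pandas; a minimal sketch assuming the columns shown above (make_groups_from_fileinfo_sheet below does the authoritative parsing):


In [ ]:
fileinfo = pd.read_csv('/global/homes/b/bpb/Downloads/test_make_groups.txt', sep='\t')
print(fileinfo['group'].value_counts()) # how many runs landed in each group
print(fileinfo[fileinfo['group'].isnull()]) # runs that were never assigned a group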


In [ ]:
g = dp.make_groups_from_fileinfo_sheet('/global/homes/b/bpb/Downloads/test_make_groups.txt',
                                       filetype='tab',
                                       store=True)

View the list of metatlas objects using "to_dataframe"


In [ ]:
metob.to_dataframe(g)

3. Create a new Atlas

  • From a spreadsheet.
    This is by far the most common way to create a new Atlas in Metabolite Atlas. The columns of the sheet must match exactly what is shown here. When a compound is not in the database, the "label" field below is used instead. Here is an example of what a sheet could look like:

label                        | rt_min | rt_max | rt_peak | mz          | mz_tolerance | inchi_key
violacein                    | 4.2    | 4.4    | 4.3     | 344.1036913 | 5            | XAPNKXIRQFHCHN-QGOAFFKASA-N
deoxyviolacein (iso1 - main) | 4.75   | 4.9    | 4.8     | 328.1087767 | 5            | OJUJNNKCVPCATE-QGOAFFKASA-N
tryptophan                   | 2.3    | 2.45   | 2.36    | 205.0978776 | 5            | QIVBCDIJIAJPQS-VIFPVBQESA-N
deoxychromoviridans          | 5.4    | 6      | 5.75    | 605.244821  | 5            |
chromoviridans               | 5.15   | 5.5    | 5.3     | 621.239736  | 5            |
ABMBA                        | 4.72   | 4.88   | 4.8     | 229.9811    | 5            | LCMZECCEEOQWLQ-UHFFFAOYSA-N

These tables can be CSV, tab-delimited text, or Excel spreadsheets.

There is a lookup table of all compounds (linked here) for getting the inchi_key.

For old MetAtlas atlases, you can use Excel's VLOOKUP function along with this lookup table to map the old names to valid InChI keys:

=VLOOKUP(H2,$A:$B,2,0), where $A:$B are the columns containing name and inchi_key

This is a link to all the old compound identifications that were in the database prior to the refactoring in mid-June 2016.
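
The same mapping can be done in pandas; a minimal sketch, assuming you have saved the lookup table with name and inchi_key columns (both file names here are hypothetical):


In [ ]:
old_atlas = pd.read_csv('my_old_atlas.csv') # hypothetical: old atlas sheet with a 'name' column
lookup = pd.read_csv('lookup.csv') # hypothetical export of the compound lookup table
old_atlas = old_atlas.merge(lookup[['name','inchi_key']], on='name', how='left')
old_atlas[old_atlas['inchi_key'].isnull()] # names that still need a manual lookup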


In [ ]:
dp = reload(dp)
myAtlas = dp.make_atlas_from_spreadsheet('/global/homes/b/bpb/Downloads/test_pos_mode_atlas.csv',
                                         'test_atlas_positive_mode',
                                         filetype='csv',
                                         sheetname='',
                                         polarity='positive',
                                         store=False,
                                         mz_tolerance=20)

4. Select groups of files to operate on


In [ ]:
dp = reload(dp)
groups = dp.select_groups_for_analysis(name = 'test_groups_pos',
                                       most_recent = True,
                                       remove_empty = True,
                                       include_list = [], exclude_list = ['Ref','pellet','_Del_'])#['QC','Blank'])

In [ ]:
for g in groups:
    for f in g.items:
        print f.name

5. Select Atlas to use

  • ### Select by Atlas name

In [ ]:
#default ema atlas: EMA_pos%50447%
atlases = metob.retrieve('Atlas',name='test_atlas_positive_mode',username='*')
for i,a in enumerate(atlases):
    print(i,a.name,pd.to_datetime(a.last_modified,unit='s'),len(a.compound_identifications))
  • Alternative way to get an atlas

In [ ]:
atlases = dp.get_metatlas_atlas(name='coelicolor_features_20170620',do_print = True,most_recent=False)
  • A list of atlases is returned by the cell above.

  • You must run the following cell to specify which Atlas you want to continue your analysis with (even if only a single atlas is returned).

  • Make a dataframe from your atlas. This is required for the new, faster data slicer.


In [ ]:
myAtlas = atlases[0]
atlas_df = ma_data.make_atlas_df(myAtlas)
atlas_df['label'] = [cid.name for cid in myAtlas.compound_identifications]
print myAtlas.name
print myAtlas.username

Change the atlas's name


In [ ]:
myAtlas.name = 'SK-ZZ-MAIZE-HILIC-NEG-EMA50447'

Remove flagged compounds from your atlas


In [ ]:
print(len(myAtlas.compound_identifications))
myAtlas.compound_identifications = [atlas_id for atlas_id in myAtlas.compound_identifications if atlas_id.description != 'remove']
print(len(myAtlas.compound_identifications))

Adjust RTs using a linear y = m*x + b correction


In [ ]:
m = 0.99
b = 0.346
extra_rt = 0.2
for cpd_id in myAtlas.compound_identifications:
    rt_ref = cpd_id.rt_references[0]
    rt_ref.rt_min = m * rt_ref.rt_min + b - extra_rt
    rt_ref.rt_peak = m * rt_ref.rt_peak + b
    rt_ref.rt_max = m * rt_ref.rt_max + b + extra_rt
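
The slope and intercept above are typically estimated by regressing observed peak RTs against the atlas reference RTs for a few well-behaved compounds; a minimal sketch (the two arrays are hypothetical values you would supply):


In [ ]:
import numpy as np
reference_rts = np.array([2.36, 4.3, 4.8]) # hypothetical rt_peak values from the atlas
observed_rts = np.array([2.41, 4.6, 5.1]) # hypothetical RTs where those peaks actually eluted in the new runs
m, b = np.polyfit(reference_rts, observed_rts, 1) # least-squares fit of observed = m*reference + b
print(m, b)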

Store the updated Atlas


In [ ]:
metob.store([myAtlas])

Take a look at the selected atlas:

  • View only the first few lines.

  • View the atlas as a qgrid widget (good for widescreen displays); a qgrid sketch follows the next cell.

  • View the atlas as a pandas dataframe (tweak the pandas display options to fit).


In [ ]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 100)
atlas_df.head(8000)
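
To view the atlas in a qgrid widget instead, reuse the QGridWidget call from section 1 (a sketch; qgrid's API differs between versions):


In [ ]:
my_grid = qgrid.QGridWidget(df=atlas_df)
my_grid.export()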

6. Get Data


In [ ]:
all_files = []
for my_group in groups:
    for my_file in my_group.items:
        all_files.append((my_file,my_group,atlas_df,myAtlas))
        
pool = mp.Pool(processes=min(4, len(all_files)))
t0 = time.time()
metatlas_dataset = pool.map(ma_data.get_data_for_atlas_df_and_file, all_files)
pool.close()
pool.terminate()
# If your code crashes here, make sure to terminate any leftover processes (see section 10, Clean up Zombie Processes).
print time.time() - t0

Select parameters to split atlas_df and metatlas_dataset by


In [ ]:
###
# Parameter meanings:
# each parameter is compared to best scoring metric for each compound
# across all files
###
# 'min_intensity' <= highest intensity across all files for given compound
# 'rt_tolerance' >= shift of median RT across all files for given compound to reference
# 'mz_tolerance' >= ppm of median mz across all files for given compound relative to reference
# 'min_msms_score' <= highest compound dot-product score across all files for given compound relative to reference
# 'min_num_frag_matches' <= number of matching mzs when calculating max_msms_score
# 'min_relative_frag_intensity' <= ratio of second highest to first highest intensity of matching sample mzs


###
# Custom
###
# kwargs = {'min_intensity': ,
#           'rt_tolerance': ,
#           'mz_tolerance': ,
#           'min_msms_score': ,
#           'allow_no_msms': ,
#           'min_num_frag_matches': ,
#           'min_relative_frag_intensity': }


###
# Loose
###
# kwargs = {'min_intensity': 1e3,
#           'rt_tolerance': .25,
#           'mz_tolerance': 25,
#           'min_msms_score': 0.3, 'allow_no_msms': True,
#           'min_num_frag_matches': 1,  'min_relative_frag_intensity': .01}


###
# Strict
###
# kwargs = {'min_intensity': 1e5,
#           'rt_tolerance': .25,
#           'mz_tolerance': 5,
#           'min_msms_score': .6, 'allow_no_msms': False,
#           'min_num_frag_matches': 3, 'min_relative_frag_intensity': .1}
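
The filtering cell below expects kwargs to be defined, so uncomment one of the presets above or assign one directly; for example, the loose settings:


In [ ]:
kwargs = {'min_intensity': 1e3,
          'rt_tolerance': .25,
          'mz_tolerance': 25,
          'min_msms_score': 0.3, 'allow_no_msms': True,
          'min_num_frag_matches': 1, 'min_relative_frag_intensity': .01}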

Create scores_df, then split the atlas and metatlas_dataset according to the parameters selected above


In [ ]:
# fastanalysis provides the scoring and filtering helpers used below
from metatlas.helpers import fastanalysis as fa

scores_df = fa.make_scores_df(metatlas_dataset)
scores_df['passing'] = fa.test_scores_df(scores_df, **kwargs)

pass_atlas_df, fail_atlas_df, pass_dataset, fail_dataset = fa.filter_atlas_and_dataset(scores_df, atlas_df, metatlas_dataset,
                                                                                       column='passing')

Output scores_df to csv


In [ ]:
scores_df.to_csv(os.path.join(output_dir, 'compound_scores.csv')) # output_dir must already be defined (see section 8)

Save new filtered atlas(es) to database


In [ ]:
filtered_atlas = dp.make_atlas_from_spreadsheet(pass_atlas_df,
                                                atlas_name='positive_filtered_atlas_name',
                                                filetype='dataframe',
                                                polarity='positive', # fill in the polarity here ('positive' or 'negative')
                                                store=False,        # set store=True to actually save the atlas to the database
                                                mz_tolerance=20)

7a. Adjust Retention Times.


In [ ]:
a = dp.adjust_rt_for_selected_compound(metatlas_dataset,compound_idx=0,include_lcmsruns = [],alpha=0.75,width=10,height=6)

8. Make Supplementary Tables

Specify a directory to put all the figures into


In [ ]:
output_dir = '/global/homes/b/bpb/Downloads/coelicolor_features_20170727_MAGI_paper/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
  • Export the Atlas to a spreadsheet

In [ ]:
atlas_identifications = dp.export_atlas_to_spreadsheet(myAtlas,os.path.join(output_dir,'atlas_export.csv'))
  • Dataframes and spreadsheets

In [ ]:
dp = reload(dp)
peak_height = dp.make_output_dataframe(input_fname='', input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], fieldname='peak_height', output_loc=os.path.join(output_dir,'sheets'))
peak_area = dp.make_output_dataframe(input_fname='', input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], fieldname='peak_area', output_loc=os.path.join(output_dir,'sheets'))
mz_peak = dp.make_output_dataframe(input_fname='', input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], fieldname='mz_peak', output_loc=os.path.join(output_dir,'sheets'))
rt_peak = dp.make_output_dataframe(input_fname='', input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], fieldname='rt_peak', output_loc=os.path.join(output_dir,'sheets'))
mz_centroid = dp.make_output_dataframe(input_fname='', input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], fieldname='mz_centroid', output_loc=os.path.join(output_dir,'sheets'))
rt_centroid = dp.make_output_dataframe(input_fname='', input_dataset=metatlas_dataset, include_lcmsruns=[], exclude_lcmsruns=[], fieldname='rt_centroid', output_loc=os.path.join(output_dir,'sheets'))

Ideas to help with downstream analysis


In [ ]:
df = peak_height.copy()
df.columns = df.columns.droplevel(1) # drop the file level, keeping the group level
df.fillna(0,inplace=True)
group_names = df.columns.unique() # use a new name here; `groups` still holds the metatlas group objects used elsewhere

In [ ]:
df = peak_height.copy()
df.columns = df.columns.droplevel(1)
# replace missing values with zero
df.fillna(0,inplace=True)
# per-group mean across files (swap in std or count as needed)
df_stats = df.transpose().groupby('group').mean().transpose()
# df_stats = df.transpose().groupby('group').std().transpose()
# df_stats = df.transpose().groupby('group').count().transpose()

df_stats.reset_index(drop=True,inplace=True)
df_stats['label'] = [cid.name for cid in myAtlas.compound_identifications]
df_stats.to_csv('stats.csv')

In [ ]:
df_stats = df.transpose().groupby('group').mean().transpose()
df_stats.reset_index(drop=True,inplace=True)
df_stats['fold_change'] = np.log2(np.divide(df_stats[group_names[1]],df_stats[group_names[0]]))
# print df_stats[df_stats.fold_change>-2].index.tolist()
from scipy.stats import ttest_ind

cat1 = df[[c for c in df.columns if c == group_names[0]]]
cat2 = df[[c for c in df.columns if c == group_names[1]]]


df_stats['t-score'],df_stats['p-value'] = ttest_ind(cat1,cat2,axis=1)
df_stats['label'] = [cid.name for cid in myAtlas.compound_identifications]

In [ ]:
df_stats.to_csv('/global/homes/b/bpb/Downloads/magi_coelicolor_stats.csv')

In [ ]:
print(df_stats[df_stats.fold_change<-1].index.tolist())
  • Error bar plots

In [ ]:
dp = reload(dp)
dp.plot_errorbar_plots(peak_height, output_loc=os.path.join(output_dir,'error_bar_peak_height'))

In [ ]:
df.head(10)

In [ ]:
dp.plot_errorbar_plots(rt_peak, output_loc=os.path.join(output_dir,'error_bar_rt_peak'))
  • Chromatograms

Faster Chromatogram plots (beta)


In [ ]:
## THINGS YOU MIGHT WANT TO CHANGE
group = None # 'page' or 'index' or None
save = True
share_y = True

## THINGS YOU PROBABLY DON'T WANT TO CHANGE
file_names = ma_data.get_file_names(metatlas_dataset)
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]
args_list = []

chromatogram_str = 'compound_chromatograms'

if not os.path.exists(os.path.join(output_dir,chromatogram_str)):
    os.makedirs(os.path.join(output_dir,chromatogram_str))

for compound_idx, my_compound in enumerate(compound_names):
    my_data = list()
    for file_idx, my_file in enumerate(file_names):
        my_data.append(metatlas_dataset[file_idx][compound_idx])
    kwargs = {'data': my_data,
             'file_name': os.path.join(output_dir, chromatogram_str, my_compound+'.pdf'),
             'group': group,
             'save': save,
             'share_y': share_y,
             'names': file_names}
    args_list.append(kwargs)
max_processes = 4
pool = mp.Pool(processes=min(max_processes, len(metatlas_dataset[0])))
pool.map(cpp.chromplotplus, args_list)
pool.close()
pool.terminate()

Make a plot for each compound:

  • Each lcmsrun will be a subplot

In [ ]:
## THINGS YOU MIGHT WANT TO CHANGE
nCols = 8
share_y = True
## THINGS YOU PROBABLY DON'T WANT TO CHANGE
file_names = ma_data.get_file_names(metatlas_dataset)
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]
nRows = int(np.ceil(len(file_names)/float(nCols)))
args_list = []

chromatogram_str = 'compound_chromatograms'

if not os.path.exists(os.path.join(output_dir,chromatogram_str)):
    os.makedirs(os.path.join(output_dir,chromatogram_str))

for compound_idx, my_compound in enumerate(compound_names):
    my_data = list()
    for file_idx, my_file in enumerate(file_names):
        my_data.append(metatlas_dataset[file_idx][compound_idx])
    kwargs = {'data': my_data,
              'file_name': os.path.join(output_dir, chromatogram_str, my_compound+'.pdf'),
              'rowscols': (nRows, nCols),
              'share_y': share_y,
              'names': file_names}
    args_list.append(kwargs)
max_processes = 4
pool = mp.Pool(processes=min(max_processes, len(metatlas_dataset[0])))
pool.map(cp.plot_compounds_and_files_mp, args_list)
pool.close()
pool.terminate()

Make a plot for each lcmsrun

  • Each compound will be a subplot

In [ ]:
## THINGS YOU MIGHT WANT TO CHANGE
nCols = 8
share_y = False

chromatogram_str = 'lcmsrun_chromatograms'

if not os.path.exists(os.path.join(output_dir,chromatogram_str)):
    os.makedirs(os.path.join(output_dir,chromatogram_str))

## THINGS YOU PROBABLY DON'T WANT TO CHANGE
file_names = ma_data.get_file_names(metatlas_dataset)
compound_names = ma_data.get_compound_names(metatlas_dataset)[0]
nRows = int(np.ceil(len(compound_names)/float(nCols)))
args_list = []
for file_idx, my_file in enumerate(file_names):
    kwargs = {'data': metatlas_dataset[file_idx],
              'file_name': os.path.join(output_dir, chromatogram_str, my_file+'.pdf'),
              'rowscols': (nRows, nCols),
              'share_y': share_y,
              'names': compound_names}
    args_list.append(kwargs)

max_processes = 4
pool = mp.Pool(processes=min(max_processes, len(metatlas_dataset)))
pool.map(cp.plot_compounds_and_files_mp, args_list)
pool.close()
pool.terminate()
  • Identification Figures

In [ ]:
dp = reload(dp)
dp.make_identification_figure(input_dataset = metatlas_dataset, input_fname = '', include_lcmsruns = [],exclude_lcmsruns = ['InjBl','QC','Blank','blank'], output_loc=os.path.join(output_dir,'identification'))

Make a single tar file of your output directory


In [ ]:
import time
import os
timestr = time.strftime("%Y%m%d-%H%M%S")
tarball_name = timestr + '_' + os.path.basename(os.path.normpath(output_dir)) + '.tar.gz'
%system tar -zcf $tarball_name -C $output_dir .
print 'done'
from IPython.core.display import display, HTML
f1 = '/user/bpb/files'+os.path.join(os.getcwd().replace('/global/u2/b/bpb',''), tarball_name)
f2 = tarball_name
display(HTML('<a href="%s" download="%s">Start automatic download!</a>'%(f1,f2)))

The tarball will be stored in your current directory. Run this to see the current directory:


In [ ]:
%system pwd

9. Compare feature EICs (extracted-ion chromatograms) to the BPC (base-peak chromatogram) of each file


In [ ]:
import matplotlib.pyplot as plt
from textwrap import wrap
files = groups[0].items
ma_data = reload(ma_data)
for f in files:
    bpc = ma_data.get_bpc(f.hdf5_file,dataset='ms1_neg')
    fig = plt.figure(figsize=(10,6))
    plt.plot(bpc.rt,bpc.i,'k-')
    ax = plt.gca()
    ax.set_yscale('log')
    ax.set_title('\n'.join(wrap(f.name,50)))
    ax.set_xlabel('Retention Time (min)')
    ax.set_ylabel('Intensity')
    plt.xlim(0,23)
    plt.ylim(1e5,1e10)
#     fig.savefig('/global/homes/b/bpb/trent/' + f.name.split('.')[0] + '.png')

In [ ]:
import matplotlib.pyplot as plt
from textwrap import wrap

yscale = 'log'


full_file_names = ma_data.get_file_names(metatlas_dataset,full_path=True)
base_file_names = ma_data.get_file_names(metatlas_dataset,full_path=False)

if not os.path.exists(os.path.join(output_dir,'bpc_eic')):
    os.makedirs(os.path.join(output_dir,'bpc_eic'))

for file_index in range(len(full_file_names)):

    bpc = ma_data.get_bpc(full_file_names[file_index],dataset='ms1_neg')
    fig = plt.figure(figsize=(20,6))
    plt.plot(bpc.rt,bpc.i,'k-')
    for d in metatlas_dataset[file_index]:
        plt.plot(d['data']['eic']['rt'],d['data']['eic']['intensity'],c=np.random.rand(3),alpha=0.9)
    ax = plt.gca()
    ax.set_yscale(yscale)
    ax.set_title('\n'.join(wrap(base_file_names[file_index],50)))
    ax.set_xlabel('Retention Time (min)')
    ax.set_ylabel('Intensity')
    plt.xlim(0,23)
    plt.ylim(1e5,1e10)
    fig.savefig(os.path.join(output_dir,'bpc_eic', base_file_names[file_index].split('.')[0] + '.png'))

In [ ]:
ma_data = reload(ma_data)
f_list = [ma_data.compare_EIC_to_BPC_for_file(metatlas_dataset,idx,yscale='log') for idx in range(10)] # first 10 files

In [ ]:
f_list[2]

In [ ]:
full_file_names = ma_data.get_file_names(metatlas_dataset,full_path=True)
base_file_names = ma_data.get_file_names(metatlas_dataset,full_path=False)
bpc = ma_data.get_bpc(full_file_names[0])

In [ ]:
counts = bpc.mz.round(4).value_counts(normalize=False, sort=True, ascending=False, bins=None)

In [ ]:
counts.head(10)

In [ ]:
from matplotlib import pyplot as plt

bpc.mz.hist(bins=1000)
plt.show()

10. Clean up Zombie Processes

  • These are not meant to be used as part of normal work.

  • If your code crashes, use these tools to clean things up.

Try simply closing the pool


In [ ]:
pool.close()
pool.terminate()

Make a DataFrame of the current user's processes


In [ ]:
import os
import psutil
import getpass
import pandas as pd
from datetime import datetime

pids = [int(pid) for pid in os.listdir('/proc') if pid.isdigit()]
proc_df = []
for pid in pids:
    try:
        process = psutil.Process(pid)
        if process.username() == getpass.getuser():
            temp = {'pid': process.pid,
                    'name': process.name(),
                    'user': process.username(),
                    'created_timestamp': int(process.create_time()*100), # hundredths of a second since the epoch
                    'created_datestr': str(datetime.fromtimestamp(process.create_time()))}
            proc_df.append(temp)
    except (psutil.NoSuchProcess, psutil.AccessDenied): # process ended, or not ours to inspect
        pass
    
df = pd.DataFrame(proc_df)
df

Kill process by process id (pid)


In [ ]:
for pid in df[df.created_timestamp > 147939907007].pid: # created_timestamp is in hundredths of a second since the epoch
    p = psutil.Process(pid)
    p.terminate()

In [ ]:
p = psutil.Process(43671)
p.terminate()

11. Run an MZMine Batch Script


In [ ]:
files = metob.retrieve('lcmsruns',experiment='%jd_of%',name='%_pos_%',username='*')
mzml_files = []
print(len(files))
for f in files:
    mzml_files.append(f.mzml_file)

In [ ]:
import glob
files = glob.glob('/project/projectdirs/metatlas/raw_data/smkosina/20170317_SK_Arkin_PseudoAbxCsource/*.mzML')
mzml_files = []
for f in files:
    # keep negative-mode runs that are not injection blanks
    if ('_InjBl' not in f) and (('_' in f) or ('Sterile' in f)) and ('NEG_' in f):
        mzml_files.append(f)
mzml_files = sorted(mzml_files)

In [ ]:
mzml_files = []
for g in groups:
    for f in g.items:
        mzml_files.append(f.mzml_file)

In [ ]:
# mzml_files = ['/project/projectdirs/metatlas/raw_data/kblouie/20150914_actinorhodin_finalset_50mm/20150910_C18_MeOH_NEG_MSMS_Scoelicolor_media_WT_M145_Day6_3of4___Run61.mzML']

In [ ]:
new_str = 'PseudoC_C18_NEG'
# C18_MSMS_NEG_Secondary_Metabolite_Parameters
mzm.make_mzmine_scripts(mzml_files, 
                        outfile='/project/projectdirs/metatlas/projects/mzmine_parameters/%s_mzmine_output.csv'%new_str,
                        new_batch_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/%s_job_script_parameters.xml'%new_str,
                        new_sbatch_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/%s_mzmine_job.sbatch'%new_str,
                        new_qsub_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/%s_mzmine_job.qsub'%new_str,
                        base_batch_file='/project/projectdirs/metatlas/projects/mzmine_parameters/C18_MSMS_NEG_Secondary_Metabolite_Parameters.xml',
                        base_sbatch_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/mzmine_job.sbatch',
                        base_qsub_file = '/project/projectdirs/metatlas/projects/mzmine_parameters/mzmine_job.qsub')

In [ ]:
from metatlas.helpers import mzmine_helpers as mzm
# mollaretii_P10_vs_0_output.csv
# mzmine_out  = '/project/projectdirs/metatlas/projects/mzmine_parameters/pathway_1_mzmine_output.csv'
mzmine_out = '/global/project/projectdirs/metatlas/projects/mzmine_parameters/dangl_exudate_C18_pos_mzmine_output.csv'
# mzmine_out = '/global/project/projectdirs/metatlas/projects/mzmine_parameters/RootExu_C18_neg_mzmine_output.csv'
atlas_df,myAtlas = mzm.metatlas_formatted_atlas_from_mzmine_output(mzmine_out,
                                                                   'positive',
                                                                   make_atlas=False,
                                                                   atlas_name='20161117_test_mzmine_atlas_pos')

In [ ]:
atlas_df.head()


In [ ]:
df = atlas_df.copy()
df = df.fillna(0)
df.set_index(['label','mz','mz_tolerance','rt_peak','rt_min','rt_max','inchi_key','detected_polarity','max_intensity'],inplace=True,)
df.head()

In [ ]:
df = df.sort_index(axis=1)

In [ ]:
df.columns

In [ ]:
#add group and file levels to the columns
group_level = []
for c in df.columns:
    temp = ''
    for g in groups:
        for item in g.items:
            if c.replace(' Peak height','') in item.name:
                temp = g.name
    group_level.append([temp,c.replace(' Peak height','')])
list_of_lists = map(list, zip(*group_level))
df.columns = pd.MultiIndex.from_arrays(list_of_lists,names=('group','file'))
# alternative used by the pathway analysis below: split each file name into more levels, e.g.
# df.columns = pd.MultiIndex.from_arrays(list_of_lists,names=('project','pathway','strain_code','iptg','species'))
df.head()

In [ ]:
df.to_csv('~/Downloads/dangle_c18_pos.csv')


In [ ]:
mz = 363.2394162
target = mz + 1.007276 # 1.007276 = proton mass
ppm = target * 15 / 1e6 # 15 ppm window
cpds = metob.database.query('select * from compounds where mono_isotopic_molecular_weight between %.4f and %.4f'%(target-ppm,target+ppm))
results = [c for c in cpds]
pd.DataFrame(results)

In [ ]:
638.4049378 - 1.007276 # subtract the proton mass to get the neutral monoisotopic weight

In [ ]:
cpds = metob.retrieve('Compounds',mono_isotopic_molecular_weight = '637.39%')
for c in cpds:
    print c.name,(c.mono_isotopic_molecular_weight + 1.007276 - 638.4049378) / 638.4049378 * 1e6

In [ ]:
import matplotlib.pyplot as plt
from textwrap import wrap

fig,ax = plt.subplots(4,6,figsize=(20,13))

for i,s in enumerate(df.columns.get_level_values('species').unique()):
    idx = np.unravel_index(i,(4,6)) #convert to row,column index
    xdata = df.xs(('Pathway=0',s),level=['pathway','species'],axis=1).max(axis=1)
    ydata = df.xs(('Pathway=1',s),level=['pathway','species'],axis=1).max(axis=1)
    ax[idx].plot(xdata.fillna(0)+1,ydata.fillna(0)+1,'.',markersize=20)
    ax[idx].set_title('\n'.join(wrap(s,18)))
    ax[idx].set_xlabel('wild type')
    ax[idx].set_ylabel('engineered')
    ax[idx].set_yscale('log')
    ax[idx].set_xscale('log')

plt.tight_layout()
plt.show()

In [ ]:
df_signal = pd.DataFrame()
for i,s in enumerate(df.columns.get_level_values('species').unique()):
    idx = np.unravel_index(i,(4,6)) #convert to row column index
    xdata = df.xs(('Pathway=0',s),level=['pathway','species'],axis=1).max(axis=1)
    ydata = df.xs(('Pathway=1',s),level=['pathway','species'],axis=1).max(axis=1)
    df_signal[s] = (xdata<1e5) & (ydata>1e7)
df_signal['product count'] = df_signal.sum(axis=1)
df_signal = df_signal.sort_values('product count',ascending=False)
# for i,s in enumerate(df.columns.get_level_values('species').unique()):
#     df_signal = df_signal.drop(s, 1)
# df_signal

In [ ]:
fig = plt.figure(figsize= (7,7))
ax = plt.hist(df_signal['product count'],bins=24)
plt.xlabel('Number of species')
plt.ylabel('Number of features')
plt.show()

In [ ]:
df_signal.reset_index(inplace=True)
#add intensity
df_signal[(df_signal['product count']>2) & (df_signal['mz']>200)].sort_values('max_intensity',ascending=False).to_csv('Pathway_1_Products.csv')

In [ ]:
print df.index.name
print df.index.get_values()[0]

In [ ]:
# 223,217,396
filter_col = [col for col in list(atlas_df) if 'Peak height' in col if '_P0_' in col if '223' in col]
atlas_df['P0 Intensity'] = atlas_df[filter_col].max(axis=1)
filter_col = [col for col in list(atlas_df) if 'Peak height' in col if '_P10_' in col if '223' in col]
atlas_df['P10 Intensity'] = atlas_df[filter_col].max(axis=1)
filter_col

In [ ]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(5,5))
ax = fig.gca()
plt.plot(atlas_df['P0 Intensity']+1,atlas_df['P10 Intensity']+1,'.',markersize=20)
plt.xlabel('Wild Type')
plt.ylabel('Engineered Strain')
ax.set_xscale('log')
ax.set_yscale('log')
plt.show()

In [ ]:
df = atlas_df[(atlas_df['P0 Intensity']<1e5)]
df = df.sort_values('P10 Intensity',ascending=False)
df.head(20)

In [ ]:
print atlas_df.shape
print myAtlas.name
atlas_df.head()

Remove compounds from an atlas where the peak height is less than a cutoff


In [ ]:
m = peak_height.max(axis=1)
print len(m)
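
A minimal sketch of the removal itself, following the same pattern as the intensity-filter cell near the end of this notebook (store the atlas afterwards if you want to keep the change):


In [ ]:
cutoff = 1e5
keep = (peak_height.max(axis=1) > cutoff).tolist() # one boolean per compound
myAtlas.compound_identifications = [cid for cid,k in zip(myAtlas.compound_identifications,keep) if k]
print(len(myAtlas.compound_identifications))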

Ancient Code and Partially Developed Tools


In [ ]:
### Store the metatlas dataset to a pickle file
# saved_filename = '/global/homes/b/bpb/Downloads/20160818_POS_MO_HEfungusonly_V1.pkl'
# with open(saved_filename,'w') as f:
#     dill.dump(metatlas_dataset,f)

### Load a pre-existing metatlas dataset
# metatlas_dataset = ma_data.get_dill_data(saved_filename)

In [ ]:
### copy files to $SCRATCH
### You will likely never have to do this, but just in case, here is the code.
# from shutil import copyfile
# scratch = os.environ['SCRATCH']
# for my_group in groups:
#     for my_file in my_group.items:
#         new_path = os.path.join(scratch,'temp_metatlas')
#         if not os.path.isdir(new_path):
#             os.mkdir(new_path)
#         new_file = os.path.join(new_path,os.path.basename(my_file.hdf5_file))
#         copyfile(my_file.hdf5_file, new_file)
#         my_file.hdf5_file = new_file
#         print my_file.hdf5_file

In [ ]:
# %matplotlib inline
# dp = reload(dp)
# pickles = ['/global/homes/b/bpb/Downloads/KZ_Avena_Exudate_atlases_and_groups_1/neg_data.pkl',
#           '/global/homes/b/bpb/Downloads/KZ_Avena_Exudate_atlases_and_groups_1/pos_data.pkl',
# '/global/homes/b/bpb/Downloads/KZ_Avena_Uptake_atlases_and_group_2/pos_data.pkl',
# '/global/homes/b/bpb/Downloads/KZ_Avena_Uptake_atlases_and_group_2/neg_data.pkl']
# for p in pickles:
#     plot_location_label = p.split('.')[0]+'/'
#     print plot_location_label
#     if not os.path.exists(plot_location_label):
#         os.makedirs(plot_location_label)
#     metatlas_dataset = ma_data.get_dill_data(p)
#     dp.make_identification_figure(input_dataset = metatlas_dataset, input_fname = p, include_lcmsruns = [],exclude_lcmsruns = ['RootCass','QC','Blank','blank'], output_loc=plot_location_label+'/identification')

In [ ]:
######### DO NOT USE #######
# output_filename = '/global/homes/b/bpb/Downloads/20160531_KBL_C18_Vio_cells_384_Q_1_to_4.pkl'
# data = dp.get_data_for_groups_and_atlas(groups,myAtlas,output_filename)
############################

In [ ]:
######### DO NOT USE #######
### THIS STILL NEEDS SOME REPAIRS ###
# rt_corrector.display_atlases()
### USE AT YOUR OWN RISK ###
############################

In [ ]:
# msmls_files = metob.retrieve('Lcmsruns',experiment = '20161007_KBL_MPZHILIC3um_MSMLS_stds',name = '%pos%',username = '*')
# print len(msmls_files)
# kate_files = metob.retrieve('Lcmsruns',experiment = '20161007_KBL_MPZHILIC3um_KateStandards',name = '%pos%', username = '*')
# print len(kate_files)
# g = metob.Group()
# g.name = '20161007_MP3umZHILIC_V12_POS_MetIDJamboree'
# for f in msmls_files:
#     g.items.append(f)
# for f in kate_files:
#     g.items.append(f)
# metob.store([g])

In [ ]:
# Drop compounds whose peak height never reaches min_intensity in any file.
# Consider recording what was removed so the project knows those compounds fell below the intensity requirement.
peak_height = dp.make_output_dataframe(input_fname = '',input_dataset = metatlas_dataset,include_lcmsruns = [],exclude_lcmsruns = [], fieldname='peak_height' , output_loc=os.path.join(output_dir,'sheets'))
min_intensity = 1e5
ids = [myAtlas.compound_identifications[i] for i,b in enumerate(peak_height.max(axis=1) > min_intensity) if b]
print(len(ids))
myAtlas.compound_identifications = ids
metob.store([myAtlas])
